# Mount Google Drive so the project data and helper library paths below are reachable.
from google.colab import drive
drive.mount('/content/drive/')
The aim of this competition is to predict a book's price. The model could be used by a publishing house to determine the price of a new book they plan to launch.
# Notebook shell cells: install the TPU client and download the GloVe embedding file.
!pip install cloud-tpu-client
!gdown --id 1wg8LXo5UuFWMMP-x5xatcgdwVTKUr6nM # Download the glove embedding file
import os
import pandas as pd
import logging
import numpy as np
import re
import pickle
from importlib.machinery import SourceFileLoader
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sn
import plotly.io as pio
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import (mean_squared_error,\
mean_absolute_error)
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
# Configure root logger at DEBUG so every intermediate shape/setting is traced.
logging.basicConfig(level='DEBUG')
logger = logging.getLogger()
# Project data directory on the mounted Drive.
PROJECT_PATH = os.path.join(os.getcwd(),\
'drive',\
'MyDrive',\
'Datasets',\
'MachineHack',\
'PredictPriceBook')
logger.debug(f'directory has {os.listdir(PROJECT_PATH)}')
# Directory holding the reusable helper modules loaded below.
LIBRARY_PATH = os.path.join(os.getcwd(),\
'drive',\
'MyDrive',\
'mylib')
logger.debug(f'directory has {os.listdir(LIBRARY_PATH)}')
# Widen pandas display limits for notebook inspection.
logger.debug(f"max col width setting is {pd.\
get_option('display.max_colwidth')}")
pd.set_option('display.max_colwidth',100)
logger.debug(f"max columns setting is {pd.\
get_option('display.max_columns')}")
pd.set_option('display.max_columns',100)
logger.debug(f"display.float fomat is {pd.\
get_option('display.float_format')}")
pd.set_option('display.float_format', '{:.3f}'.format)
logger.debug(f'Default renderer is {pio.renderers}')
#Setting renderer
pio.renderers.default = 'notebook_connected'
logger.debug(f'New renderer is {pio.renderers}')
# Load helper modules directly from Drive (EDA, evaluation, model building,
# feature engineering).
eda = SourceFileLoader('eda',\
os.path.join(LIBRARY_PATH,\
'eda.py')).load_module()
# NOTE(review): this shadows the builtin `eval` for the rest of the notebook;
# renaming would require touching every later cell, so it is only flagged here.
eval = SourceFileLoader('eval',\
os.path.join(LIBRARY_PATH,\
'eval.py')).load_module()
mb = SourceFileLoader('model_building',\
os.path.join(LIBRARY_PATH,\
'model_building.py')).load_module()
fe = SourceFileLoader('feature_engineering',\
os.path.join(LIBRARY_PATH,\
'feature_engineering.py')).load_module()
# Running metric ledger persisted as eval/eval.csv across notebook runs.
el = eval.RegressionEvaluation(precision=3)
el.load_data(os.path.join(PROJECT_PATH,'eval','eval.csv'))
logger.debug(f'Here is the loaded metric file \n {el.get_metrics()}')
logger.debug(f'Files in directory are {os.listdir(PROJECT_PATH)}')
# Competition train/test spreadsheets.
dataset = pd.read_excel(os.path.join(PROJECT_PATH,'Data_Train.xlsx'))
test_dataset = pd.read_excel(os.path.join(PROJECT_PATH,'Data_Test.xlsx'))
logger.debug(f'Shape of Train Dataset is {dataset.shape}')
logger.debug(f'Shape of Test Dataset is {test_dataset.shape}')
# Train rows whose Title also appears in test: used later to pin them to the
# training split.
val_dataset = dataset[dataset['Title'].\
isin(test_dataset['Title'])]
val_index = val_dataset.index
logger.debug(f'val_dataset shape is {val_dataset.shape}')
eda.explain_data(dataset)
Exploring each feature and its behavior with Target
# Tag test rows with a sentinel Price of -1, then stack train+test so all the
# feature engineering below is applied to both sets consistently.
test_dataset.loc[:,'Price'] = -1
merge_dataset = pd.concat([dataset,test_dataset])
merge_dataset = merge_dataset.reset_index()
Price requires a log transformation due to huge range in values
# Inspect the raw Price distribution (train rows only).
condition = (merge_dataset['Price'] != -1)
fig = px.histogram(merge_dataset[condition],
x='Price')
fig.show()
# Log10 transform compresses the heavy right tail; the -1 sentinel is preserved
# so test rows stay identifiable.
merge_dataset['LogPrice'] = merge_dataset['Price'].\
apply(lambda x: np.log10(x) if x != -1 else -1)
fig = px.histogram(merge_dataset[condition],
x='LogPrice')
fig.show()
# condition = ((merge_dataset['LogPrice'] > 1.5) |\
# (merge_dataset['LogPrice'] == -1))
# merge_dataset = merge_dataset[condition].copy()
logger.debug(f'Merge_dataset shape is {merge_dataset.shape}')
# Discretize LogPrice into ordinal buckets; the first bin (-2,0] captures the
# -1 sentinel, i.e. the unlabeled test ("Target") rows.
log_price_bucket = pd.cut(merge_dataset['LogPrice'],\
bins=[-2,0,2.3522,2.51,2.645,2.8459,10],\
labels=['Target',\
'Very Low',\
'Low',\
'Med',\
'High',\
'Very High'])
merge_dataset['LogPriceBucket'] = log_price_bucket
merge_dataset['LogPriceBucket'] = merge_dataset['LogPriceBucket'].astype('object')
logger.debug(f"Price buckets are \n {merge_dataset['LogPriceBucket'].\
value_counts(normalize=True)}")
Analyzing records where the Title repeats tells us that book prices vary by category. There are also cases where a book has varying prices across instances while all other fields are the same.
#Analyzing top repeats
merge_dataset[merge_dataset['Title'].\
isin(['Casino Royale: James Bond 007 (Vintage)'])].\
sort_values(by='Title')
# Titles that map to more than one Price value.
multi_price_titles = eda.get_multi_value_keys(merge_dataset,'Title','Price')
logger.debug(f'Multi price titles are {multi_price_titles}')
# # Averaging price for observation with all other same fields.
# ivs = list(dataset.columns.drop('Price'))
# logger.debug(f'Columns are {ivs}')
# dataset = dataset.groupby(ivs)['Price'].mean().reset_index()
# logger.\
# debug(f"Grouped merge data is \n {dataset[dataset['Title']== 'Casino Royale: James Bond 007 (Vintage)']}")
wc = eda.create_word_cloud(merge_dataset['Title'],\
stop_words=['book','books'])
Below is the distribution of Sentence length
# Title length in words, plotted against Price for train rows.
merge_dataset['title_len'] = merge_dataset['Title'].apply(lambda x: len(x.split()))
train_condition = merge_dataset['LogPrice'] != -1
fig = px.box(x=merge_dataset.loc[train_condition,'title_len'],\
y=merge_dataset.loc[train_condition,'Price'],\
log_y=True)
fig.show()
# Titles attributed to more than one distinct author name (spelling issues).
title_by_author = merge_dataset.groupby('Title').agg({'Author':'nunique'}).reset_index()
titles = title_by_author.loc[title_by_author['Author'] >1,:]
logger.debug(f'Titles with more than 1 author names \n{titles}')
# Handling inconsistencies in Author name: the titles flagged above carry
# multiple author spellings, so force one canonical author per title.
merge_dataset.loc[merge_dataset['Author'] == 'Tom Jenks',\
'Author'] = 'Raymond Carver, Tom Jenks'
merge_dataset.loc[merge_dataset['Title'] == 'The Old Man and the Sea',\
'Author'] = 'Ernest Hemingway'
merge_dataset.loc[merge_dataset['Title'] == 'The Elements of Style',\
'Author'] = 'William Strunk Jr., E. B. White'
merge_dataset.loc[merge_dataset['Title'] == 'Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things That Happened',\
'Author'] = 'Alexandra Brosh'
# Normalized author key: lower-case, with whitespace, dots and commas removed.
# Raw string fixes the invalid escape sequence '\s' that newer Python versions
# flag (SyntaxWarning); the regex itself is unchanged.
merge_dataset['Author_clean'] = merge_dataset['Author'].apply(lambda x: re.sub(r'\s+|\.|\,','',x.lower()))
#Top Authors
# Keep the authors covering the top `pareto` share of the price buckets;
# everyone else is pooled into 'other_Author'.
pareto = .192
train_dataset = merge_dataset[train_condition].copy()
train_dataset,top_authors = eda.bucketize_pareto(train_dataset,\
'Author_clean',\
'LogPriceBucket',\
pareto)
logger.debug(f'Top authors are \n {top_authors}')
merge_dataset['top_Author'] = merge_dataset.\
apply(lambda x: x['Author_clean'] if x['Author_clean'] in top_authors else 'other_Author',axis=1)
Adding more details about each author, such as title count and log-price statistics.
# Per-author statistics computed on train rows only (count of titles and
# min/mean/max/std of LogPrice), then joined back onto the merged frame.
train_condition = merge_dataset['LogPrice'] != -1
# NOTE(review): np.amin/np.mean/np.amax/np.std inside agg() are deprecated in
# newer pandas; string names 'min'/'mean'/'max'/'std' are the modern spelling.
author_details = (merge_dataset[train_condition].\
groupby('Author_clean').\
agg({'Title':'count',\
'LogPrice':[np.amin,\
np.mean,\
np.amax,\
np.std]}).\
reset_index())
# Flatten the MultiIndex columns into Author_<field>_<stat>.
cols = ["Author" + "_" + str(i[0]) + "_" + str(i[1]) for i in author_details.columns]
author_details.columns = cols
#Dropping Books by Authors with less than 5 titles
# (threshold actually used is 2, not 5 — the comment above is historical)
author_details = author_details[author_details['Author_Title_count'] >= 2]
logger.debug(author_details)
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
author_details,\
left_on ='Author_clean',\
right_on = 'Author_Author_clean_',\
how='left')
logger.debug(merge_dataset.shape)
merge_dataset.drop('Author_Author_clean_',inplace=True,axis=1)
eda.explain_data(merge_dataset)
# Price spread for top authors (train rows only).
condition = ((merge_dataset['Price'] != -1))
fig = px.box(merge_dataset[condition],\
x='top_Author',\
y='Price',
log_y=True)
fig.show()
condition = ((merge_dataset['Price'] != -1))
fig = px.scatter(merge_dataset[condition],\
x='Author_LogPrice_mean',\
y='Price',
color = 'Author_Title_count',
log_y=True)
fig.show()
# Flag train rows more than 4 author-level std above the author mean.
outlier_upper_bound = merge_dataset['Author_LogPrice_mean'] + 4*merge_dataset['Author_LogPrice_std']
outlier_lower_bound = merge_dataset['Author_LogPrice_mean'] - 4*merge_dataset['Author_LogPrice_std']
condition = ((merge_dataset['LogPrice'] > outlier_upper_bound) &\
(merge_dataset['Author_LogPrice_std'] > 0))
logger.debug(f"Outliers are {merge_dataset[condition].shape}")
logger.debug(f"\n{merge_dataset[condition]}")
The Edition field follows the format: (Language), Binding, – Sourcing, Day Month Year.
logger.debug(f"Top 5 records in Edition are \n {merge_dataset['Edition'].head()}")
# Regex capture groups: 1=(Language) 2=Binding 3=Sourcing 4=Day 5=Month 6=Year.
# NOTE(review): non-raw string relies on Python keeping unknown escapes like
# '\w' literal; a raw string would be the safer spelling.
pattern = '(?:(\(\w+\)),)?(\w+(?:-\w+)?(?:\s+\w+)?(?:\s+\w+)?),–\s+(?:(\w+(?:\s+\w+)?),\s+)?(?:(\d+)\s+)?(?:(\w+)\s+)?(?:(\d+))?'
edition = merge_dataset['Edition'].\
apply(lambda x: re.findall(pattern,x))
logger.debug(f'Glimpse of edition \n {edition}')
lan = []
bind = []
sour = []
month = []
year = []
# Collect the first match per row; index 3 (day of month) is deliberately
# skipped — only language, binding, sourcing, month and year are kept.
for i in edition:
lan.append(i[0][0])
bind.append(i[0][1])
sour.append(i[0][2])
month.append(i[0][4])
year.append(i[0][5])
logger.debug(f'Unique value for lan are {np.unique(lan)}')
logger.debug(f'Unique value for bind are {np.unique(bind)}')
logger.debug(f'Unique value for sour are {np.unique(sour)}')
logger.debug(f'Unique value for month are {np.unique(month)}')
logger.debug(f'Unique value for year are {np.unique(year)}')
edition_details = pd.DataFrame(list(zip(bind,\
sour,\
month,\
year)),\
columns = ['bind',\
'sour',\
'month',\
'year'])
# Correcting Regex Error
# Empty captures get sensible defaults; month values that are really binding
# qualifiers ('Box','Large','Special') are mapped to 'Unknown'.
edition_details.loc[edition_details['sour'].\
isin(['']),'sour'] = 'Other_sour'
edition_details.loc[edition_details['month'].\
isin(['Box','Large','Special','']),'month'] = 'Unknown'
edition_details.loc[edition_details['year'].\
isin(['']),'year'] = '2018'
edition_details['month'].unique()
# Label-encode month names to integers (Unknown -> 0).
month_label = dict({'Mar': 3,\
'Nov' : 11,\
'Feb' : 2,\
'Oct' : 10,\
'May' : 5,\
'Dec' : 12,\
'Jan' : 1,\
'Jun' : 6,\
'Jul' : 7,\
'Sep' : 9,\
'Unknown': 0,\
'Aug' : 8,
'Apr': 4})
edition_details['month'].replace(month_label,inplace=True)
# Column-wise concat relies on both frames sharing the same 0..n-1 index.
merge_dataset = pd.concat([merge_dataset,\
edition_details],axis=1)
eda.explain_data(merge_dataset)
Definite pricing trend observed with respect to bind type, Spiral bound being expensive as compared to Hardcover and Paperback. Mass Market Paperback being the cheapest alternative for all 4.
# Price distribution by binding type (train rows only).
fig = px.box(merge_dataset[merge_dataset['Price'] != -1],\
x='bind',\
y='Price',\
log_y=True)
fig.show()
#Top Binds
pareto = .995
train_dataset = merge_dataset[train_condition].copy()
train_dataset,top_binds = eda.bucketize_pareto(train_dataset,\
'bind',\
'LogPriceBucket',\
pareto)
merge_dataset['top_bind'] = merge_dataset.\
apply(lambda x: x['bind'] if x['bind'] in top_binds else 'other_bind',axis=1)
# Per-bind LogPrice statistics on train rows, joined back (same recipe as the
# author features above).
train_condition = merge_dataset['LogPrice'] != -1
bind_details = (merge_dataset[train_condition].\
groupby('bind').\
agg({'Title':'count',\
'LogPrice':[np.amin,\
np.mean,\
np.amax,\
np.std]}).\
reset_index())
cols = ["bind" + "_" + str(i[0]) + "_" + str(i[1]) for i in bind_details.columns]
bind_details.columns = cols
#Dropping Books by bind with less than 5 titles
bind_details = bind_details[bind_details['bind_Title_count'] >= 2]
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
bind_details,\
left_on='bind',\
right_on = 'bind_bind_',\
how='left'
)
logger.debug(merge_dataset.shape)
merge_dataset.drop('bind_bind_',inplace=True,axis=1)
#Top Sour
pareto = .985
train_dataset = merge_dataset[train_condition].copy()
# NOTE(review): the result is stored in `top_binds`, clobbering the binding
# list above. Behavior is still consistent (the bind list has no later use),
# but `top_sours` would be the clearer name.
train_dataset,top_binds = eda.bucketize_pareto(train_dataset,\
'sour',\
'LogPriceBucket',\
pareto)
merge_dataset['top_sour'] = merge_dataset.\
apply(lambda x: x['sour'] if x['sour'] in top_binds else 'other_sour',axis=1)
# Per-sourcing LogPrice statistics, same recipe again.
train_condition = merge_dataset['LogPrice'] != -1
sour_details = (merge_dataset[train_condition].\
groupby('sour').\
agg({'Title':'count',\
'LogPrice':[np.amin,\
np.mean,\
np.amax,\
np.std]}).\
reset_index())
cols = ["sour" + "_" + str(i[0]) + "_" + str(i[1]) for i in sour_details.columns]
sour_details.columns = cols
#Dropping Books by bind with less than 5 titles
sour_details = sour_details[sour_details['sour_Title_count'] >= 2]
logger.debug(sour_details)
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
sour_details,\
left_on='sour',\
right_on = 'sour_sour_',\
how='left'
)
logger.debug(merge_dataset.shape)
merge_dataset.drop('sour_sour_',inplace=True,axis=1)
The price range increases and gets more wide as year of publication increases.
# Mean price by (year, month) for January and February releases.
# Fix: 'month' was label-encoded to integers above (Jan -> 1, Feb -> 2), so
# the original string filter isin(['Jan','Feb']) matched no rows and produced
# an empty plot.
condition = ((merge_dataset['LogPrice'] != -1) &\
(merge_dataset['month'].isin([1,2])))
year_month_price = (merge_dataset[condition].\
groupby(['year',\
'month'])['Price'].\
mean().\
reset_index())
fig = px.scatter(year_month_price,\
x='year',\
y='Price',\
color='month',\
log_y=True,
range_x=(1980,2021))
fig.show()
# Extract the numeric star score from strings like '4.1 out of 5 stars'.
# Fixes: raw string avoids invalid-escape warnings, and the dot is escaped —
# the original '(\d+.\d+)' let '.' match any character between the digits.
pattern = r'(\d+\.\d+)\s+out of 5 stars'
reviews = merge_dataset['Reviews'].\
apply(lambda x: re.findall(pattern,x))
# First match per row (comprehension replaces the manual append loop).
merge_dataset['review_num'] = [m[0] for m in reviews]
merge_dataset['review_num'] = merge_dataset['review_num'].astype('float64')
Reviews are not necessarily dependent on the Price; prices are spread almost evenly across review scores.
# Price vs review score (train rows only).
fig = px.box(merge_dataset[merge_dataset['Price'] != -1],\
x='review_num',\
y='Price',\
log_y=True)
fig.show()
# Ratings count: first whitespace token with thousands separators stripped,
# e.g. '1,234 customer reviews' -> 1234.0.
merge_dataset['rating_num'] = (merge_dataset['Ratings'].\
apply(lambda x: x.split()[0]).
apply(lambda x: x.replace(',','')))
merge_dataset['rating_num'] = merge_dataset['rating_num'].astype('float64')
Popular books with High Rating have specific price range, hinting towards the most selling price range
# Price vs rating count on log-log axes.
fig = px.scatter(merge_dataset[merge_dataset['Price'] != -1],\
x='rating_num',\
y='Price',\
log_y=True,
log_x=True)
fig.show()
# Compress rating counts; +10 keeps log10 defined for zero/low counts.
merge_dataset['rating_num'] = merge_dataset['rating_num'].\
apply(lambda x: np.log10(x+10))
#Top Genres
pareto = .48
train_dataset = merge_dataset[train_condition].copy()
train_dataset,top_genres = eda.bucketize_pareto(train_dataset,\
'Genre',\
'LogPriceBucket',\
pareto)
merge_dataset['top_Genre'] = merge_dataset.\
apply(lambda x: x['Genre'] if x['Genre'] in top_genres else 'other_Genre',axis=1)
# Per-genre LogPrice statistics on train rows, joined back (same recipe as
# the author/bind/sour features).
train_condition = merge_dataset['LogPrice'] != -1
Genre_details = (merge_dataset[train_condition].\
groupby('Genre').\
agg({'Title':'count',\
'LogPrice':[np.amin,\
np.mean,\
np.amax,\
np.std]}).\
reset_index())
cols = ["Genre" + "_" + str(i[0]) + "_" + str(i[1]) for i in Genre_details.columns]
Genre_details.columns = cols
#Dropping Books by Authors with less than 5 titles
Genre_details = Genre_details[Genre_details['Genre_Title_count'] >= 2]
logger.debug(Genre_details)
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
Genre_details,\
left_on='Genre',\
right_on = 'Genre_Genre_',\
how='left'
)
logger.debug(merge_dataset.shape)
merge_dataset.drop('Genre_Genre_',inplace=True,axis=1)
# Price spread for the top genres.
condition = ((merge_dataset['Price'] != -1) &\
(merge_dataset['top_Genre'] != 'other_Genre'))
fig = px.scatter(merge_dataset[condition],\
x='Price',\
color='top_Genre',
log_x=True)
fig.show()
condition = ((merge_dataset['Price'] != -1))
fig = px.scatter(merge_dataset[condition],\
x='Genre_LogPrice_mean',\
y='Price',
color = 'Genre_Title_count',
log_y=True)
fig.show()
condition = (merge_dataset['Price'] != -1)
fig = px.scatter(merge_dataset[condition],\
color='BookCategory',\
x='Price',
log_x=True)
fig.show()
# Per-BookCategory LogPrice statistics (no count filter here — every category
# is kept, which later supplies the fallback values for imputation).
train_condition = merge_dataset['LogPrice'] != -1
BookCat_details = (merge_dataset[train_condition].\
groupby('BookCategory').\
agg({'Title':'count',\
'LogPrice':[np.amin,\
np.mean,\
np.amax,\
np.std]}).\
reset_index())
cols = ["BookCategory" + "_" + str(i[0]) + "_" + str(i[1]) for i in BookCat_details.columns]
BookCat_details.columns = cols
logger.debug(BookCat_details)
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
BookCat_details,\
left_on='BookCategory',\
right_on = 'BookCategory_BookCategory_',\
how='left'
)
logger.debug(merge_dataset.shape)
merge_dataset.drop('BookCategory_BookCategory_',inplace=True,axis=1)
# Synopsis word cloud and length feature.
wc = eda.create_word_cloud(merge_dataset['Synopsis'])
merge_dataset['synopsis_len'] = merge_dataset['Synopsis'].apply(lambda x: len(x.split()))
fig = px.box(y=merge_dataset['synopsis_len'])
fig.show()
# Audit missing values introduced by the left joins above.
logger.debug(f"There are {merge_dataset.isnull().sum().sum()} missing values")
logger.debug(f"Columns with missing values are \n{merge_dataset.\
isnull().\
sum()[merge_dataset.\
isnull().\
sum()>0]}")
NA in above values suggest absence of respective fields in training data.
# Count and std columns default to 0 when the key never appeared in train
# (count: no titles seen; std: undefined for a single observation).
replace_zero = ['Author_Title_count',\
'Author_LogPrice_std',\
'bind_Title_count',\
'bind_LogPrice_std',\
'sour_Title_count',\
'sour_LogPrice_std',\
'Genre_Title_count',\
'Genre_LogPrice_std']
merge_dataset.loc[:,replace_zero] = merge_dataset.loc[:,replace_zero].fillna(0)
def impute_values(df,impute_vals,impute_cols,func):
    """Fill NaNs in *impute_cols* from a row-wise statistic of *impute_vals*.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to impute; modified in place and also returned.
    impute_vals : list of str
        Columns whose row-wise min/mean/max supplies the fill value.
    impute_cols : list of str
        Columns whose NaNs are replaced by that row-wise statistic.
    func : str
        One of 'min', 'mean' or 'max'.

    Returns
    -------
    pd.DataFrame
        The same frame with NaNs in *impute_cols* filled.

    Raises
    ------
    ValueError
        If *func* is not one of the supported names (the original silently
        fell through and then crashed on a missing temp column).
    """
    if func == 'min':
        fill = df.loc[:, impute_vals].min(axis=1)
    elif func == 'mean':
        fill = df.loc[:, impute_vals].mean(axis=1)
    elif func == 'max':
        fill = df.loc[:, impute_vals].max(axis=1)
    else:
        raise ValueError(f"Unsupported func {func!r}; use 'min', 'mean' or 'max'")
    # Vectorized fillna replaces the original per-row df.apply + np.isnan scan
    # and avoids creating/dropping the temporary 'impute_vals' column.
    for col in impute_cols:
        df[col] = df[col].fillna(fill)
    return df
# Missing author/genre/sour/bind statistics fall back to the BookCategory
# statistic of the same kind (min -> min, mean -> mean, max -> max).
impute_vals = ['BookCategory_LogPrice_amin']
impute_cols = ['Author_LogPrice_amin',\
'Genre_LogPrice_amin',\
'sour_LogPrice_amin',\
'bind_LogPrice_amin']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'min')
impute_vals = ['BookCategory_LogPrice_mean']
impute_cols = ['Author_LogPrice_mean',\
'Genre_LogPrice_mean',\
'sour_LogPrice_mean',\
'bind_LogPrice_mean']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'mean')
impute_vals = ['BookCategory_LogPrice_amax']
impute_cols = ['Author_LogPrice_amax',\
'Genre_LogPrice_amax',\
'sour_LogPrice_amax',\
'bind_LogPrice_amax']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'max')
eda.explain_data(merge_dataset)
logger.debug(merge_dataset.columns)
# Column groups used by the encoders/scalers below.
target = 'LogPrice'
cat_cols = ['BookCategory',\
'top_Author',\
'top_bind',\
'top_sour',\
'top_Genre']
num_cols = ['title_len',\
'Author_Title_count',\
'Author_LogPrice_amin',\
'Author_LogPrice_mean',\
'Author_LogPrice_amax',\
'Author_LogPrice_std',\
'month',\
'year',\
'bind_Title_count',\
'bind_LogPrice_amin',\
'bind_LogPrice_mean',\
'bind_LogPrice_amax',\
'bind_LogPrice_std',\
'sour_Title_count',\
'sour_LogPrice_amin',\
'sour_LogPrice_mean',\
'sour_LogPrice_amax',\
'sour_LogPrice_std',\
'review_num',\
'rating_num',\
'Genre_Title_count',\
'Genre_LogPrice_amin',\
'Genre_LogPrice_mean',\
'Genre_LogPrice_amax',\
'Genre_LogPrice_std',\
'BookCategory_Title_count',\
'BookCategory_LogPrice_amin',\
'BookCategory_LogPrice_mean',\
'BookCategory_LogPrice_amax',\
'BookCategory_LogPrice_std',\
'synopsis_len']
text_cols = ['Title',\
'Synopsis']
index_cols = ['index']
merge_dataset.loc[:,cat_cols] = merge_dataset[cat_cols].astype('category')
merge_dataset.loc[:,num_cols] = merge_dataset[num_cols].astype('float64')
Adding the publication sequence of each author's titles. Over time, some pricing intelligence builds up after an author's first book is launched.
# Sortable YYYYMM key (year is still a string here, so '*100 + month'
# concatenates via string repetition semantics only if numeric — assumes
# 'year' was coerced numeric upstream; TODO confirm dtype).
merge_dataset['year_month'] = merge_dataset['year']*100 + merge_dataset['month']
# Rank each title chronologically within its author, then normalize to [0,1];
# single-title authors get 0.
merge_dataset['title_seq'] = (merge_dataset.groupby(['Author'])['year_month'].rank())
author_title_max = (merge_dataset.\
groupby('Author')['title_seq'].\
max().\
reset_index())
author_title_max.columns = ['Author','title_seq_max']
merge_dataset = pd.merge(merge_dataset,\
author_title_max,\
on='Author')
merge_dataset['title_seq'] = merge_dataset['title_seq']/merge_dataset['title_seq_max']
merge_dataset.loc[merge_dataset['title_seq_max'] == 1,'title_seq'] = 0
logger.debug(f"\n {merge_dataset[merge_dataset['Author'] == 'Chris Kuzneski']}")
# Chronological order within author — required by the lag features below.
merge_dataset = merge_dataset.\
sort_values(by =['Author','year','month']).\
reset_index(drop=True)
condition = ((merge_dataset['Price'] != -1) &\
(merge_dataset['top_Author'] != 'other_Author'))
fig = px.scatter(merge_dataset[condition],\
x='year_month',\
y='Price',\
color='top_Author',\
log_y=True,
range_x=(200000,202100))
fig.show()
Adding lag in reviews to capture last 3 reviews by the author
merge_dataset['review_num_lag1'] = merge_dataset['review_num'].shift(periods=1,fill_value=0)
merge_dataset['review_num_lag2'] = merge_dataset['review_num'].shift(periods=2,fill_value=0)
merge_dataset['review_num_lag3'] = merge_dataset['review_num'].shift(periods=3,fill_value=0)
for i in range(merge_dataset.shape[0]):
if i > 3:
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-1,'Author']:
merge_dataset.loc[i,'review_num_lag1'] = 0
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-2,'Author']:
merge_dataset.loc[i,'review_num_lag2'] = 0
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-3,'Author']:
merge_dataset.loc[i,'review_num_lag3'] = 0
logger.debug(f"\n logging few to demonstrate above \n {merge_dataset[merge_dataset['Author'] == 'Satyajit Ray'].\
sort_values('title_seq')}")
Adding lag in ratings to capture the last 3 ratings by the author
merge_dataset['rating_num_lag1'] = merge_dataset['rating_num'].shift(periods=1,fill_value=0)
merge_dataset['rating_num_lag2'] = merge_dataset['rating_num'].shift(periods=2,fill_value=0)
merge_dataset['rating_num_lag3'] = merge_dataset['rating_num'].shift(periods=3,fill_value=0)
for i in range(merge_dataset.shape[0]):
if i > 3:
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-1,'Author']:
merge_dataset.loc[i,'rating_num_lag1'] = 0
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-2,'Author']:
merge_dataset.loc[i,'rating_num_lag2'] = 0
if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-3,'Author']:
merge_dataset.loc[i,'rating_num_lag3'] = 0
logger.debug(f"\n logging few to demonstrate above \n {merge_dataset[merge_dataset['Author'] == 'Satyajit Ray'].\
sort_values('title_seq')}")
# Separate labeled train rows from sentinel (-1) test rows.
train_data = merge_dataset[merge_dataset['LogPrice'] != -1]
test_data = merge_dataset[merge_dataset['LogPrice'] == -1]
logger.debug(f'Shape of test_dataset is {train_data.shape}')
logger.debug(f'Shape of test_dataset is {test_data.shape}')
#Rearranging test_data orders
# Restore the original submission row order via the saved 'index' column.
test_data = test_data.sort_values(by='index').reset_index(drop=True)
logger.debug(f"Top 5 values {test_data.head()}")
# Train/validation split. Rows whose Title also appears in the test set are
# forced into the training split so the model has seen those titles.
y = train_data[target]
X = train_data.drop(target,axis=1)
X_tr = X[X['Title'].isin(test_dataset['Title'])]
# Fix: select targets by the filtered rows' index. The original took `.index`
# of the full boolean mask, which is the entire index of X and silently made
# y_tr a copy of all of y.
y_tr = y[X_tr.index]
X_train,X_val,y_train,y_val = train_test_split(X,y,random_state=123,test_size=.3)
# Append the overlap rows to train; drop_duplicates removes any already
# sampled by the split, and y_train is realigned by index.
X_train = pd.concat([X_train,X_tr]).drop_duplicates()
y_train = y[X_train.index]
logger.debug(f'Shape of X_train is {X_train.shape}')
logger.debug(f'Shape of X_val is {X_val.shape}')
logger.debug(f'Shape of Y_train is {y_train.shape}')
logger.debug(f'Shape of Y_val is {y_val.shape}')
# Extend the numeric feature list with the engineered sequence/lag columns.
num_cols = num_cols + ['year_month','title_seq', 'title_seq_max',\
'review_num_lag1', 'review_num_lag2',\
'review_num_lag3', 'rating_num_lag1', 'rating_num_lag2',\
'rating_num_lag3']
# One-hot encode the categorical columns; fitted on train only, then applied
# to train/val/test identically.
mapper = DataFrameMapper([([cat_col],OneHotEncoder()) for cat_col in cat_cols],\
df_out=True)
mapper.fit(X_train)
X_train_ohe = mapper.transform(X_train)
X_val_ohe = mapper.transform(X_val)
X_test_ohe = mapper.transform(test_data)
logger.debug(f'Shape of X_train_ohe is {X_train_ohe.shape}')
logger.debug(f'Shape of X_val_ohe is {X_val_ohe.shape}')
logger.debug(f'Shape of X_test_ohe is {X_test_ohe.shape}')
# Min-max scale the numeric columns; fitted on train only.
scl = MinMaxScaler()
scl.fit(X_train[num_cols])
X_train_std = pd.DataFrame(scl.transform(X_train[num_cols]),
columns = num_cols,
index = X_train.index)
X_val_std = pd.DataFrame(scl.transform(X_val[num_cols]),
columns=num_cols,
index=X_val.index)
test_dataset_std = pd.DataFrame(scl.transform(test_data[num_cols]),
columns=num_cols,
index=test_data.index)
logger.debug(f'Shape of X_train is {X_train_std.shape}')
logger.debug(f'Shape of X_val is {X_val_std.shape}')
logger.debug(f'Shape of X_test is {test_dataset_std.shape}')
# TF-IDF over Title (1-3 grams, terms appearing in >= 20 docs).
# NOTE(review): get_feature_names() was removed in newer scikit-learn in
# favor of get_feature_names_out().
encoding_strategy = 'tfidfTitle'
tfidf = TfidfVectorizer(ngram_range=(1,3),\
min_df=20,\
stop_words=STOP_WORDS)
X_train_title = tfidf.fit_transform(X_train['Title'])
features = [ 'Title_' + str(i) for i in tfidf.get_feature_names()]
X_train_title = pd.DataFrame(X_train_title.toarray(),\
columns=features,\
index = X_train.index)
logger.debug(f'Features are {features}')
logger.debug(f'Shape of X_train is {X_train_title.shape}')
tfidf_vocab = pd.concat([pd.Series(tfidf.get_feature_names(),\
name='features'),\
pd.Series(tfidf.idf_,\
name='idf')],axis=1)
logger.debug(f" Logging omnibus \n{tfidf_vocab[tfidf_vocab['features'] == 'complete']}")
logger.debug(f"\n{tfidf_vocab.sort_values('idf',ascending=False)}")
X_val_title = tfidf.transform(X_val['Title'])
X_val_title = pd.DataFrame(X_val_title.toarray(),\
columns=features,\
index=X_val.index)
test_data_title = tfidf.transform(test_data['Title'])
test_data_title = pd.DataFrame(test_data_title.toarray(),\
columns=features,\
index=test_data.index)
# TF-IDF over Synopsis with domain stop words added.
encoding_strategy = 'tfidfSynopsis' + encoding_strategy
stop_words = set(list(STOP_WORDS)+['book','world','books'])
tfidf = TfidfVectorizer(ngram_range=(1,3),\
min_df=250,\
stop_words=stop_words)
X_train_synopsis = tfidf.fit_transform(X_train['Synopsis'])
features = [ 'Synopsis_' + str(i) for i in tfidf.get_feature_names()]
X_train_synopsis = pd.DataFrame(X_train_synopsis.toarray(),\
columns=features,\
index = X_train.index)
logger.debug(f'Features are {features}')
logger.debug(f'Shape of X_train is {X_train_synopsis.shape}')
X_val_synopsis = tfidf.transform(X_val['Synopsis'])
X_val_synopsis = pd.DataFrame(X_val_synopsis.toarray(),\
columns=features,\
index=X_val.index)
test_data_synopsis = tfidf.transform(test_data['Synopsis'])
test_data_synopsis = pd.DataFrame(test_data_synopsis.toarray(),\
columns=features,\
index=test_data.index)
tfidf_vocab = pd.concat([pd.Series(tfidf.get_feature_names(),\
name='features'),\
pd.Series(tfidf.idf_,\
name='idf')],axis=1)
logger.debug(f" Logging omnibus \n{tfidf_vocab[tfidf_vocab['features'] == 'edition']}")
logger.debug(f"\n{tfidf_vocab.sort_values('idf',ascending=False)}")
# GloVe-based autoencoder embeddings for Title and Synopsis; these replace
# the TF-IDF frames computed above (same variable names are reassigned).
encoding_strategy = 'gloveTitle' + encoding_strategy
gv = fe.GloveAutoEncoder()
X_train_title = gv.create_glove_encoding(docs=X_train['Title'],\
col_name='Title',\
max_length=9,\
epochs=200,
learning_rate=.5)
X_val_title = gv.transform_glove_encoding(X_val['Title'])
test_data_title = gv.transform_glove_encoding(test_data['Title'])
encoding_strategy = 'gloveSynopsis' + encoding_strategy
gv = fe.GloveAutoEncoder()
# NOTE(review): col_name='Synopis' is misspelled — presumably only affects
# generated column labels; verify against GloveAutoEncoder before changing.
X_train_synopsis = gv.create_glove_encoding(docs=X_train['Synopsis'],\
col_name='Synopis',\
max_length=100,\
epochs=200,
learning_rate=.5)
X_val_synopsis = gv.transform_glove_encoding(X_val['Synopsis'])
test_data_synopsis = gv.transform_glove_encoding(test_data['Synopsis'])
# Assemble the final design matrices: scaled numeric + one-hot + text blocks.
X_train = pd.concat([X_train_std,\
X_train_ohe,\
X_train_title,\
X_train_synopsis],axis=1)
X_val = pd.concat([X_val_std,\
X_val_ohe,\
X_val_title,\
X_val_synopsis],axis=1)
X_test = pd.concat([test_dataset_std,\
X_test_ohe,\
test_data_title,\
test_data_synopsis],axis=1)
logger.debug(f'Shape of X_train is {X_train.shape}')
logger.debug(f'Shape of X_val is {X_val.shape}')
logger.debug(f'Shape of X_test is {X_test.shape}')
eda.explain_data(X_train)
# Baseline: plain linear regression on LogPrice.
model_desc = 'LinearReg' + encoding_strategy
lr = LinearRegression(normalize=True)
lr.fit(X_train,y_train)
logger.debug(f'R2 score is {lr.score(X_train,y_train)}')
x_train_pred = lr.predict(X_train)
x_val_pred = lr.predict(X_val)
# Predicted vs actual on the validation split.
fig = px.scatter(y=x_val_pred,\
x=y_val,\
trendline="ols",\
hover_data=[y_val.index],\
log_y=True)
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,lr,model_desc)
file_name = model_desc + '.xlsx'
# Undo the log10 transform to get a Price submission file.
x_test_pred = pd.Series(pow(10,lr.predict(X_test)),\
name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
el.get_metrics()
features = X_train.columns
coef = pd.Series(lr.coef_,index=features)
logger.debug(f"Coef are \n{coef.sort_values()}")
# ElasticNet with a small grid over alpha/l1_ratio/selection, searched via the
# shared mb.build_model helper (which also persists the fitted model).
model_desc = 'ElasticNet' + encoding_strategy
en = ElasticNet()
params = {
'alpha':np.linspace(.0001,1,3),
'l1_ratio':[.7,.9,1],
'normalize' : [False],
'selection': ['cyclic','random']
}
logger.debug(f"Params are {params}")
en = mb.build_model(en,params,X_train,y_train,X_val,model_desc,
PROJECT_PATH,scoring='neg_mean_squared_log_error',verbose=10,\
has_sample_weight=0)
x_train_pred = en.predict(X_train)
x_val_pred = en.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,\
trendline="ols",\
hover_data=[y_val.index])
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,en,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
features = X_train.columns
coef = pd.Series(en.coef_,index=features)
logger.debug(f"Coef are \n{coef.sort_values(ascending=False)}")
# Decision tree baseline; the commented alternatives record grids already
# explored in earlier runs.
model_desc = 'DecisionTree' + encoding_strategy
dt = DecisionTreeRegressor()
params = {
'criterion' : ['mse'], #['mse','poisson','mae'],
'min_samples_split' : [2], #np.linspace(2,42,3,dtype='int64'),
'min_samples_leaf' : [42], #np.linspace(22,62,3,dtype='int64'),
'max_depth': [5,10,None],
'min_impurity_decrease': np.linspace(0,3,2,dtype='int64')
}
logger.debug(params)
dt = mb.build_model(dt,params,X_train,y_train,X_val,model_desc,
PROJECT_PATH,scoring='neg_mean_squared_log_error',
verbose=10)
x_train_pred = dt.predict(X_train)
x_val_pred = dt.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,\
trendline="ols")
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,dt,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
mb.plot_feature_importances(X_train,dt)
# Random forest with a pinned (previously tuned) parameter grid.
model_desc = 'RandomForest' + encoding_strategy
rf = RandomForestRegressor()
params = {
'n_estimators': [75],
'criterion' : ['mae'], #['mse','mae'],
'min_samples_split' : [3], #np.linspace(2,20,3,dtype='int64'),
'min_samples_leaf' : [3], #np.linspace(1,30,4,dtype='int64')
'min_impurity_decrease': [0], #np.linspace(0,10,3)
'max_depth' : [None],
'max_features' : [.7] #['auto',.7,'sqrt','log2']
}
logger.debug(params)
rf = mb.build_model(rf,params,X_train,y_train,X_val,model_desc,
PROJECT_PATH,scoring='neg_mean_squared_log_error',
verbose=10)
x_train_pred = rf.predict(X_train)
x_val_pred = rf.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,
trendline='ols')
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,rf,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
# Submission file: undo the log10 transform back to Price.
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,rf.predict(X_test)),\
name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
# #Loading the file.
# file_name = str(model_desc) + '.pkl'
# rf = pickle.load(open(os.\
# path.\
# join(PROJECT_PATH,\
# 'model',\
# file_name),'rb'))
# Refit on the top 50% of features by importance.
top_features = mb.plot_feature_importances(X_train,rf)
logger.debug(f"Top 50 features are \n {top_features[:50]}")
l = int(.50*len(top_features))
top_features = top_features.loc[:l,'feature']
model_desc = 'RandomForestTopFeatures' + encoding_strategy
rf_top = mb.build_model(rf,\
params,\
X_train[top_features],\
y_train,\
X_val[top_features],\
model_desc,\
PROJECT_PATH,\
scoring='neg_mean_squared_log_error',
verbose=10)
x_train_pred = rf_top.predict(X_train[top_features])
x_val_pred = rf_top.predict(X_val[top_features])
fig = px.scatter(y=x_val_pred,\
x=y_val,
trendline='ols')
fig.show()
el.add_metrics(y_train,\
y_val,\
X_train[top_features],\
X_val[top_features],\
rf_top,\
model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
# Support vector regression; note the scoring differs from the tree models
# (RMSE instead of MSLE).
model_desc = 'SVM' + encoding_strategy
svm = SVR()
params = {
'kernel': ['rbf'], #['rbf','poly','sigmoid'],
'degree': [3], #np.linspace(3,7,3,dtype='int64'),
'C' : [1,.5,3], #np.linspace(.0001,12.5,3),
'epsilon' : [.1],#np.linspace(.1,.3,2,dtype='float64'),
'gamma': ['scale',.01,.1]
}
logger.debug(params)
svm = mb.build_model(svm,params,X_train,y_train,X_val,model_desc,
PROJECT_PATH,scoring='neg_root_mean_squared_error',
verbose=10)
x_train_pred = svm.predict(X_train)
x_val_pred = svm.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,\
trendline="ols")
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,svm,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,svm.predict(X_test)),\
name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
# XGBoost with a mostly pinned grid (min_child_weight is the only live axis).
model_desc = 'XGBoost' + encoding_strategy
xgrf = XGBRegressor()
params = {
'n_estimators': [100],
'objective' : ['reg:squarederror'], #['mse','mae'],
'max_depth' : [4], #np.linspace(3,4,2,dtype='int64'),
'learning_rate': [.23], #np.linspace(.1,1,4),
'gamma': [0], #np.linspace(0,10,4),
'min_child_weight': [5,20], #np.linspace(1,10,3,dtype='int64'),
'colsample_bytree': [.85], #np.linspace(.4,1,4),
'subsample' : [1], #np.linspace(.7,1,3),
'max_delta_step': [0], #[0,5],
'reg_lambda' : [1], #np.linspace(1,2,2,dtype='int64'),
'reg_alpha' : [0] #np.linspace(0,2,2,dtype='int64')
}
logger.debug(params)
xgrf = mb.build_model(xgrf,params,X_train,y_train,X_val,model_desc,
PROJECT_PATH,scoring='neg_mean_squared_log_error',
verbose=10)
x_train_pred = xgrf.predict(X_train)
x_val_pred = xgrf.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,\
trendline="ols")
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,xgrf,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,xgrf.predict(X_test)),\
name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
# Refit on the top 50% of features by importance (same recipe as the forest).
top_features = mb.plot_feature_importances(X_train,xgrf)
l = int(.50*len(top_features))
top_features = top_features.loc[:l,'feature']
model_desc = 'XGBoostTopFeatures' + encoding_strategy
xgrf_top = mb.build_model(xgrf,\
params,\
X_train[top_features],\
y_train,\
X_val[top_features],\
model_desc,\
PROJECT_PATH,\
scoring='neg_mean_squared_log_error',\
verbose=10)
x_train_pred = xgrf_top.predict(X_train[top_features])
x_val_pred = xgrf_top.predict(X_val[top_features])
fig = px.scatter(y=x_val_pred,\
x=y_val,
trendline='ols')
fig.show()
el.add_metrics(y_train,\
y_val,\
X_train[top_features],\
X_val[top_features],\
xgrf_top,\
model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
# Stack every tuned base model with an XGBoost meta-learner.
# NOTE(review): rf_top/xgrf_top were tuned on the reduced feature set but the
# stack is fit on the full X_train — verify build_model returns estimators
# that accept the full frame.
model_desc = 'Stacking' + encoding_strategy
estimators = [
('en',en),
('dt',dt),
('rf',rf),
('rf_top',rf_top),
('svm',svm),
('xgrf',xgrf),
('xgrf_top',xgrf_top)
]
stack = StackingRegressor(
estimators=estimators,
final_estimator = XGBRegressor(objective='reg:squarederror'),
n_jobs=-1,
verbose=100)
stack.fit(X_train,y_train)
#Save model
model_name = str(model_desc) + '.pkl'
pickle.dump(stack,open(os.path.join(PROJECT_PATH,\
'model',model_name),'wb'))
x_train_pred = stack.predict(X_train)
x_val_pred = stack.predict(X_val)
fig = px.scatter(y=x_val_pred,\
x=y_val,
trendline='ols')
fig.show()
el.add_metrics(y_train,y_val,X_train,X_val,stack,model_desc)
el.get_metrics()
el.save_data(PROJECT_PATH)
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,stack.predict(X_test)),\
name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
#Loading the file.
file_name = str(model_desc) + '.pkl'
stack = pickle.load(open(os.\
path.\
join(PROJECT_PATH,\
'model',\
file_name),'rb'))
# Error analysis on the train rows whose Title overlaps the test set.
mod = stack
X_tr_temp = X_train.loc[X_tr.index,:]
y_tr_temp = y_train[X_tr_temp.index]
x_pred_temp = pd.Series(mod.predict(X_tr_temp),\
name='y_pred',\
index=X_tr_temp.index)
X_tr_merge_temp = pd.concat([X_tr_temp,y_tr_temp,x_pred_temp],\
axis=1)
# Per-row RMSLE-style error, mapped back through pow(10,...).
# NOTE(review): this applies log10 on values that are already LogPrice —
# confirm the intended metric before reusing this number elsewhere.
X_tr_merge_temp['error'] = pow(10,\
np.\
sqrt(np.\
square(np.\
log10(1+X_tr_merge_temp['LogPrice']) - np.\
log10(1+X_tr_merge_temp['y_pred']))))
X_tr_merge_temp.sort_values('error',ascending=False,inplace=True)
# Inspect the worst rows against the forest's most important features.
top_features = mb.plot_feature_importances(X_train,rf)
n_features = top_features.loc[:15,'feature']
l = pd.Series(['LogPrice','y_pred','error'])
n_features = n_features.append(l)
X_tr_merge_temp[n_features].head()
train_data.loc[2274,:]
X_train.loc[6407,:]
fig = px.scatter(data_frame=X_tr_merge_temp,
y='Author_LogPrice_mean',\
x='error',\
hover_data=[X_tr_merge_temp.index])
fig.show()
X_train_title.loc[6407,:].sort_values(ascending=False)
X_train_synopsis.loc[6407,:].sort_values(ascending=False)
# Export the notebook itself to HTML.
!jupyter nbconvert --to html 'MachineHack_PredictPriceBookNC.ipynb'